1 Introduction

1.1 Business problem

Maverik is interested in producing more accurate financial plans and initial ROI documents for future convenience store locations. Considerable uncertainty is associated with deploying new locations in their network and being able to properly allocate resources and accurately predict profitability is crucial to the prosperity of their business. To this end, this project aims to augment veracity of financial plans and ROI documents by leveraging the store-level time series and qualitative data collected by Maverik. This will be done by employing an ensemble of forecasting and supervised regression models designed to provide daily store level sales forecasts of multiple key product categories. Success of this project will be benchmarked against Maverik’s existing Naive forecasting solution and the proposed model will be tuned to minimize:

  • Akaike Information Criterion (AIC)
  • Root Mean Squared Error (RMSE)
  • Mean Absolute Percentage Error (MAPE)

Key abilities of the final model must include:

  • Calculation of daily level, seasonally sensitive forecast for each of the sales metrics defined by Maverik.
  • Functionality to create updated forecasts given new data.

The ability to accurately forecast store sales will enable Maverik to optimize the expenditure of scarce resources by pursuing the most profitable locations and minimizing misallocation of said resources when opening a new store. This will lead to better investment, decreased waste, and more reliable financial evaluations.

1.2 EDA Objectives

As well as gaining a greater holistic understanding of the provided data, this EDA aims to answer the following questions:

  • What preprocessing/cleaning is required?
    • What is the scale of missing values? How should they be handled?
    • Do erroneously encoded categorical variables need to be corrected?
    • How do date attributes need to be standardized or formatted to ensure accurate seasonality calculations?
    • Is the existing data “tidy”?
      • Each variable has its own column
      • Each observation has its own row
      • Each value has its own cell
  • Which features are most correlated with each of the target sales metrics?
  • What explanatory variables are collinear and how should that be handled?
  • What effect does seasonality have on the target variables?

2 Loading packages and data

# Load packages 
library(lemon)
library(skimr)
library(lubridate)
library(magrittr)
library(zoo)
library(tidyverse)
library(readxl)
library(scales)
library(GGally)
library(ggrepel)
library(ggforce)
library(caret)
library(ggTimeSeries)
library(ggpointdensity)
library(fpp3)
library(patchwork)
library(plotly)

# Load data
# Time series data: one row per store per day, with sales for the four
# target metrics plus calendar attributes.
mvts <- read_csv("../../../data/time_series_data_msba.csv") %>%
  # drop the unnamed row-index column
  select(-1) %>%
  # assign simplified column names (original column order preserved)
  rename_with(~c("open_date", "date", "week_id", "day_name", "holiday",
                 "day_type", "inside_sales", "food_service", "diesel",
                 "unleaded", "site_id")) %>%
  # site_id first, then sort chronologically within each site
  relocate(site_id) %>%
  arrange(site_id, date)

# Qualitative data: one row per site describing store attributes and
# surrounding-area demographics.
mvq <- read_csv("../../../data/qualitative_data_msba.csv") %>%
  # drop the unnamed index column plus the two duplicated
  # fueling-position columns
  select(-c(1, `RV Lanes Fueling Positions`, `Hi-Flow Lanes Fueling Positions`)) %>%
  # rename site_id_msba to site_id and move it to the front
  select(site_id = site_id_msba, colnames(.)) %>%
  # normalize column names to valid snake_case identifiers
  rename_with(function(x) {
    x <- tolower(x)
    # spaces, hyphens, and slashes become underscores
    x <- gsub(" +|-|/", "_", x)
    # strip single quotes and apostrophes
    x <- gsub("'|’", "", x)
    # prefix names starting with a digit so they are syntactically valid
    gsub("^(\\d)", "x\\1", x)
  }) %>%
  # avoid a name clash with the `diesel` sales column in the time series data
  rename(has_diesel = diesel)

3 Initial inspection

3.1 Time series data

# First look at the raw rows
head(mvts, 5)
# Summary statistics via skimr; site_id is cast to character so it is
# profiled as a label rather than summarized as a number
mvts %>%
  mutate(site_id = as.character(site_id)) %>%
  skim()
## ── Data Summary ────────────────────────
##                            Values    
## Name                       Piped data
## Number of rows             13908     
## Number of columns          11        
## _______________________              
## Column type frequency:               
##   character                4         
##   Date                     2         
##   numeric                  5         
## ________________________             
## Group variables            None      
## 
## ── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 site_id               0             1   5   5     0       38          0
## 2 day_name              0             1   6   9     0        7          0
## 3 holiday               0             1   4  22     0       26          0
## 4 day_type              0             1   7   7     0        2          0
## 
## ── Variable type: Date ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate min        max        median     n_unique
## 1 open_date             0             1 2021-01-12 2022-08-16 2021-10-08       32
## 2 date                  0             1 2021-01-12 2023-08-16 2022-04-18      947
## 
## ── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate   mean     sd   p0   p25   p50   p75   p100 hist 
## 1 week_id               0             1   26.5   15.0   1    14    26    39     52  ▇▇▇▇▇
## 2 inside_sales          0             1 2847.   981.    0  2181. 2694. 3325.  7172. ▁▇▅▁▁
## 3 food_service          0             1  760.   342.    0   521.  697.  924.  2532. ▃▇▂▁▁
## 4 diesel                0             1 1703.  2161.    0   383. 1019. 2283. 20854. ▇▁▁▁▁
## 5 unleaded              0             1 2382.  1026.  240. 1654. 2257. 2928.  8077. ▅▇▂▁▁

The time series data contains the daily, store level sales of the four target variables: inside_sales, food_service, diesel, and unleaded. Other data pertaining to dates are provided herein, including:

  • week_id: Fiscal week number
  • day_name: Day of the week name
  • open_date: Date the store opened
  • holiday: What holiday (if any) occurred on that day
  • day_type: Either “WEEKDAY” or “WEEKEND”

None of the variables have any missing values that will have to be dealt with.

3.2 Qualitative data

# First look at the raw rows
head(mvq, 5)
# Summary statistics via skimr; site_id is cast to character so it is
# profiled as a label rather than summarized as a number
mvq %>%
  mutate(site_id = as.character(site_id)) %>%
  skim()
## ── Data Summary ────────────────────────
##                            Values    
## Name                       Piped data
## Number of rows             37        
## Number of columns          52        
## _______________________              
## Column type frequency:               
##   character                28        
##   numeric                  24        
## ________________________             
## Group variables            None      
## 
## ── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##    skim_variable                    n_missing complete_rate min max empty n_unique whitespace
##  1 site_id                                  0             1   5   5     0       37          0
##  2 lottery                                  0             1   2   3     0        2          0
##  3 freal                                    0             1   2   3     0        2          0
##  4 bonfire_grill                            0             1   2   3     0        2          0
##  5 pizza                                    0             1   2   3     0        2          0
##  6 cinnabon                                 0             1   2   3     0        2          0
##  7 godfathers_pizza                         0             1   2   2     0        1          0
##  8 ethanol_free                             0             1   2   3     0        2          0
##  9 has_diesel                               0             1   3   3     0        1          0
## 10 hi_flow_lanes                            0             1   2   3     0        2          0
## 11 rv_lanes                                 0             1   2   3     0        2          0
## 12 hi_flow_rv_lanes                         0             1   2   3     0        2          0
## 13 def                                      0             1   2   3     0        2          0
## 14 cat_scales                               0             1   2   3     0        2          0
## 15 car_wash                                 0             1   2   2     0        1          0
## 16 ev_charging                              0             1   2   2     0        1          0
## 17 rv_dumps                                 0             1   2   3     0        2          0
## 18 propane                                  0             1   2   3     0        2          0
## 19 traditional_forecourt_layout             0             1   5   7     0        2          0
## 20 traditional_forecourt_stack_type         0             1   4  11     0        3          0
## 21 rv_lanes_layout                          0             1   3   7     0        3          0
## 22 rv_lanes_stack_type                      0             1   3   5     0        3          0
## 23 hi_flow_lanes_layout                     0             1   3   5     0        3          0
## 24 hi_flow_lanes_stack_type                 0             1   3   5     0        2          0
## 25 hi_flow_rv_lanes_layout                  0             1   3   7     0        4          0
## 26 hi_flow_rv_lanes_stack_type              0             1   3   5     0        3          0
## 27 non_24_hour                              0             1   2   2     0        1          0
## 28 self_check_out                           0             1   3   3     0        1          0
## 
## ── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##    skim_variable                           n_missing complete_rate     mean        sd    p0   p25   p50   p75   p100 hist 
##  1 open_year                                       0             1  2021.       0.475  2021  2021  2021  2022   2022 ▇▁▁▁▃
##  2 square_feet                                     0             1  4970.     576.     2933  5046  5046  5046   6134 ▁▁▁▇▁
##  3 front_door_count                                0             1     2        0         2     2     2     2      2 ▁▁▇▁▁
##  4 years_since_last_project                        0             1     1.65     0.484     1     1     2     2      2 ▅▁▁▁▇
##  5 parking_spaces                                  0             1    37.4      5.92     23    34    38    41     49 ▂▂▇▆▃
##  6 x1_mile_pop                                     0             1  6704.    5694.        0  1984  5574 11269  18692 ▇▃▃▃▂
##  7 x1_mile_emp                                     0             1  4758.    4697.       56  1771  3895  6002  26077 ▇▃▁▁▁
##  8 x1_mile_income                                  0             1 53300.   24333.        0 39538 46356 73519 110957 ▁▇▅▅▂
##  9 x1_2_mile_pop                                   0             1  1833.    1915.        0   262  1003  2686   5923 ▇▃▁▁▂
## 10 x1_2_mile_emp                                   0             1  1514.    2489.       31   386  1037  1616  15403 ▇▁▁▁▁
## 11 x1_2_mile_income                                0             1 47012.   28092.        0 28185 44706 69253 104730 ▃▇▅▆▂
## 12 x5_min_pop                                      0             1 14529.   13419.        0  4725 12789 20792  55385 ▇▅▂▁▁
## 13 x5_min_emp                                      0             1  9122.    8657.      456  3131  6348 11479  34199 ▇▂▂▁▁
## 14 x5_min_inc                                      0             1 55292.   24372.        0 41009 50232 74911 114281 ▁▇▇▅▂
## 15 x7_min_pop                                      0             1 32304.   30875.      688 10672 24597 44247 137979 ▇▂▂▁▁
## 16 x7_min_emp                                      0             1 19224.   20477.      615  6870 11584 26238  83985 ▇▂▁▁▁
## 17 x7_min_inc                                      0             1 59850.   18967.    31540 48170 53805 77818 108534 ▅▇▁▃▁
## 18 traditional_forecourt_fueling_positions         0             1    14.3      3.95     10    12    12    16     24 ▇▂▁▂▁
## 19 hi_flow_lanes_fueling_positions                 0             1     3.32     2.93      0     0     5     5      9 ▇▁▇▃▁
## 20 rv_lanes_fueling_positions                      0             1     2.51     2.05      0     0     4     4      6 ▆▁▁▇▁
## 21 mens_toilet_count                               0             1     2.38     0.924     0     2     2     3      5 ▂▇▇▁▁
## 22 mens_urinal_count                               0             1     2.35     0.857     0     2     2     3      5 ▂▇▇▁▁
## 23 womens_toilet_count                             0             1     4.65     1.75      0     4     4     6     10 ▂▇▇▁▁
## 24 womens_sink_count                               0             1     1.70     0.740     0     1     2     2      4 ▁▅▇▁▁

The qualitative data set offers a mix of 52 categorical and numerical variables describing site attributes and demographic/socioeconomic statistics of the surrounding area. Interestingly, site #23065 is missing from this data set and will have to be removed from the time series data set.

# Site ids present in exactly one of the two data sets (dplyr::symdiff)
symdiff(mvts$site_id, mvq$site_id)
## [1] 23065

mvq does not have any explicitly NA values, but does utilize character strings to indicate such. Maverik has indicated that “N/A”, “None”, or any analogous value indicates the absence of that attribute at the store rather than missing data. In this case, it would be best to collapse these placeholders into a single value (“None”) rather than converting them to explicit NA and removing those observations.

mvq %>%
  # keep only columns where at least one value is a placeholder
  # ("N/A", "NA", or "none", case-insensitive)
  select(where(~any(grepl("^N/?A$|^none$", ., ignore.case = TRUE)))) %>%
  # share of placeholder values within each remaining column
  summarise(
    across(
      everything(),
      ~mean(grepl("^N/?A$|^none$", ., ignore.case = TRUE))
    )
  ) %>%
  # convert to long-form: one row per column
  pivot_longer(everything(),
               names_to = "col",
               values_to = "prop_na")
## # A tibble: 7 × 2
##   col                              prop_na
##   <chr>                              <dbl>
## 1 traditional_forecourt_stack_type   0.730
## 2 rv_lanes_layout                    0.378
## 3 rv_lanes_stack_type                0.405
## 4 hi_flow_lanes_layout               0.405
## 5 hi_flow_lanes_stack_type           0.405
## 6 hi_flow_rv_lanes_layout            0.378
## 7 hi_flow_rv_lanes_stack_type        0.405

This can be achieved with the following:

# Collapse every "N/A"-style placeholder into the single literal "None",
# leaving all other values untouched
mvq1 <- mvq %>%
  mutate(
    across(
      where(function(col) any(grepl("^N/?A$", col, ignore.case = TRUE))),
      function(col) replace(col, grepl("^N/?A$", col, ignore.case = TRUE), "None")
    )
  )

# Confirm same proportions as before
mvq1 %>%
  # columns that now carry the "None" placeholder
  select(where(~any(. == "None"))) %>%
  # share of "None" values per column
  summarise(
    across(
      everything(),
      ~mean(. == "None")
    )
  ) %>%
  # convert to long-form: one row per column
  pivot_longer(everything(),
               names_to = "col",
               values_to = "prop_na")
## # A tibble: 7 × 2
##   col                              prop_na
##   <chr>                              <dbl>
## 1 traditional_forecourt_stack_type   0.730
## 2 rv_lanes_layout                    0.378
## 3 rv_lanes_stack_type                0.405
## 4 hi_flow_lanes_layout               0.405
## 5 hi_flow_lanes_stack_type           0.405
## 6 hi_flow_rv_lanes_layout            0.378
## 7 hi_flow_rv_lanes_stack_type        0.405

mvq also contains some zero-variance variables which would not contribute to any model and should be taken out.

# Profile every column — distinct-value count, variance, and a small sample
# of unique values — to surface zero-variance columns (ndistinct == 1).
mvq1 %>%
  summarise(
    across(
      everything(),
      list(
        # for each column, calculate number of distinct values
        ndistinct = ~n_distinct(., na.rm = TRUE),
        # for each column, calculate variance
        # NOTE(review): var() is only defined for numeric/logical input —
        # confirm this does not warn or error on the character columns
        var = ~var(., na.rm = TRUE) %>% round(4),
        # for each column, concatenate up to three unique values
        samp = ~paste0(na.omit(unique(.)[1:3]), collapse = ", ")
      )
    )
  ) %>%
  # convert to long form; names_pattern splits each name at its LAST "_",
  # so the suffix (ndistinct/var/samp) feeds ".value" and becomes a column
  pivot_longer(everything(),
               names_to = c("col", ".value"),
               names_pattern = "(.*)_(.*)") %>%
  arrange(ndistinct)

It also appears some of the fuel lane data is contradictory and/or redundant. Site 22015 has no high-flow RV lanes as indicated by hi_flow_rv_lanes but somehow their high-flow RV lane layout is “In-Line”. While a single observation can easily be handled, this instance further brings into question the pervasiveness of such errors, detectable or otherwise.

# Surface the site whose hi-flow RV lane fields disagree: no hi-flow RV
# lanes, yet an "In-Line" hi-flow RV lane layout
mvq1 %>%
  filter(hi_flow_rv_lanes == "No" & hi_flow_rv_lanes_layout == "In-Line") %>%
  select(site_id, hi_flow_rv_lanes, hi_flow_rv_lanes_layout)
# Adjacent pairs of lane variables; each pair becomes one stage of the
# sankey plot below
sk_list <- list(
  c("hi_flow_lanes",              "hi_flow_lanes_layout"),
  c("hi_flow_lanes_layout",       "rv_lanes"),
  c("rv_lanes",                   "rv_lanes_layout"),
  c("rv_lanes_layout",            "hi_flow_rv_lanes"),
  c("hi_flow_rv_lanes",           "hi_flow_rv_lanes_layout"),
  c("hi_flow_rv_lanes_layout",    "rv_lanes_fueling_positions"),
  c("rv_lanes_fueling_positions", "hi_flow_lanes_fueling_positions")
)

# Take each value of list and perform transformations within a data frame.
# For each variable pair this yields one row per observed value combination,
# with `name`/`name2` holding the "variable:\nvalue" labels of the two ends
# and `n` the number of sites showing that combination.
sk <- lapply(sk_list,
             \(x){
               mvq1 %>%
                 # ensure compatibility across variables when changing to long form
                 mutate(across(everything(), as.character)) %>%
                 # count instances of value combinations
                 count(
                   pick(all_of(x))
                 ) %>%
                 # shorten column names for cleaner output
                 rename_with(~gsub("_lanes|_fueling", "", .)) %>%
                 # create row ids
                 mutate(id = row_number(),
                        .before = 1) %>%
                 # convert to long form (two rows per combination, one per variable)
                 pivot_longer(-c(n, id)) %>%
                 # combine variable name and variable value into single column
                 unite("name",
                       name:value,
                       sep = ":\n") %>%
                 group_by(id) %>%
                 # place corresponding pair into different column
                 mutate(name2 = name[2]) %>%
                 ungroup() %>%
                 # drop the second row of each pair (where name == name2),
                 # leaving one row per combination
                 filter(name != name2)
             }) %>%
  # combine list output into single data frame
  list_rbind() %>%
  # create required positional ids for sankey plot (plotly uses 0-based
  # indices into the node label vector)
  mutate(from_id = match(name, unique(c(name, name2))) - 1,
         to_id = match(name2, unique(c(name, name2))) - 1)

# create sankey plot of the fuel-lane attribute combinations in `sk`
plot_ly(
  type = "sankey",
  # ensure plot covers entire space
  domain = list(
    x = c(0,1),
    y = c(0,1)
  ),
  orientation = "h",
  node = list(
    # set spacing between nodes
    pad = 15,
    thickness = 20,
    line = list(
      color = "black",
      width = 0.5
    ),
    # node labels in the same order used to build from_id/to_id
    label = unique(c(sk$name, sk$name2))
  ),
  # Define paths
  link = list(
    source = sk$from_id,
    target = sk$to_id,
    value = sk$n,
    # color concerning combinations
    # NOTE(review): hard-coded to 30 links with the contradictory
    # combination at position 14 — this breaks silently if `sk` changes;
    # confirm row count/order before reuse
    color = c(rep("#bbbbbbaa", 13), "#ff8888", rep("#bbbbbbaa", 16))
  )
) %>% 
  layout(
    title = "Distribution of fuel lane descriptions",
    font = list(
      size = 10
    )
  )

Furthermore, there are only two unique combinations of rv_lanes_stack_type and hi_flow_rv_lanes_stack_type: either both “None” or both “HF/RV”. It seems extremely likely to me that “HF/RV” stands for “Hi(gh) Flow / Recreational Vehicle”.

If all of the following are true:

  • “HF/RV” stands for “Hi(gh) Flow / Recreational Vehicle”
  • “N/A” and “None” mean the absence of the attribute and not the absence of the data
  • The absence of any “layout” or “stack type” is equivalent to the absence of the lanes/pumps entirely

then it is necessarily true that all RV lanes in this data set are hi-flow. Site 22015 is the only observation which violates this condition. With so few sites to begin with, I’m hesitant to remove site 22015 from the data and instead would prefer to manually correct the data. A final decision will be made before modeling. In any case, the lane/pump data can be simplified to remove redundant data. This will also occur at a later time before modeling.

# Tally the observed combinations of lane flags and stack types to show
# the two-pattern structure described above
mvq1 %>%
  count(rv_lanes, hi_flow_lanes,
        rv_lanes_stack_type, hi_flow_rv_lanes_stack_type)
# Applying all basic cleaning steps and assigning to original object name
mvts <- read_csv("../../../data/time_series_data_msba.csv") %>%
  # drop the unnamed row-index column
  select(-1) %>%
  # assign simplified column names (original column order preserved)
  rename_with(~c("open_date", "date", "week_id", "day_name", "holiday",
                 "day_type", "inside_sales", "food_service", "diesel",
                 "unleaded", "site_id")) %>%
  # site_id first, then sort chronologically within each site
  relocate(site_id) %>%
  arrange(site_id, date) %>%
  # drop the store that has no match in the qualitative data
  filter(site_id != 23065)

# Reload the qualitative data with all basic cleaning steps applied.
mvq <- read_csv("../../../data/qualitative_data_msba.csv") %>%
  # removing unnamed row index column
  # `RV Lanes Fueling Positions` and `Hi-Flow Lanes Fueling Positions` are duplicated columns
  select(-c(1, `RV Lanes Fueling Positions`, `Hi-Flow Lanes Fueling Positions`)) %>%
  # set site_id (renamed from site_id_msba) as first column
  select(site_id = site_id_msba, colnames(.)) %>%
  # simplify column names
  rename_with(\(x){
    # replace spaces, slashes, and hyphens with underscores
    gsub(" +|-|/", "_", tolower(x)) %>%
      # remove single quotes and apostrophes
      gsub("'|’", "", .) %>%
      # validate variables beginning with numbers
      gsub("^(\\d)", "x\\1", .)
  }) %>%
  # distinguish from variable found in time series data
  rename(has_diesel = diesel) %>%
  # normalize "N/A"-style placeholders to the single value "None"
  # (these mark attribute absence, not missing data, so no explicit NAs
  # are introduced)
  mutate(
    across(
      where(~any(grepl("^N/?A$", ., ignore.case = TRUE))),
      ~replace(., grepl("^N/?A$", ., ignore.case = TRUE), "None")
    )
  ) %>%
  # removing zero-variance variables (a single distinct value each; they
  # cannot contribute to any model)
  select(-c(front_door_count, godfathers_pizza, has_diesel,
            car_wash, ev_charging, non_24_hour, self_check_out))

4 Dates

The entire data set spans from 2021-01-12 to 2023-08-16, and each of the 37 remaining stores is present for exactly one year and one day. Is the extra day of any analytical significance or simply an artifact of the fiscal calendar? Given that every store exists for the same length of time, it will be helpful to know the distribution of stores across dates.

# mvts %>%
#   group_by(date) %>%
#   # count of stores for each date
#   summarise(n = n()) %>%
#   ggplot() +
#   geom_col(aes(date, n),
#            fill = "darkred", color = "darkred") +
#   # Fix date axis labels
#   scale_x_date(breaks = seq(as_date("2021-01-01"), as_date("2023-09-01"), "3 months"),
#                labels = ~ifelse(month(.) == 1, format(., "%Y"), format(., "%b"))) +
#   scale_y_continuous(breaks = seq(0, 30, 5)) +
#   theme_minimal(20) +
#   labs(title = "Count of stores for each date")

# Bar plot: number of reporting stores on each calendar date
{
  mvts %>%
  # count of stores for each date
  count(date) %>%
  ggplot() +
  geom_col(aes(date, n),
           fill = "darkred", alpha = 0.7, width = 1) +
  # Fix date axis labels
  scale_x_date(breaks = seq(as_date("2021-01-01"), as_date("2023-09-01"), "3 months"),
               labels = ~ifelse(month(.) == 1, format(., "%Y"), format(., "%b"))) +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  theme_minimal(20) +
  labs(title = "Count of stores vs date")
} / # vertically combining plots with `patchwork` package
  # Line plot: one horizontal segment per store spanning its date range
  {
    mvts %>%
      arrange(open_date, site_id, date) %>% 
      # relabel site_id as 1..n in open-date order so the y axis reads as a
      # cumulative store count
      mutate(site_id = consecutive_id(site_id)) %>%
      ggplot() +
      geom_line(aes(date, site_id, group = site_id),
                color = "steelblue", linewidth = 2, show.legend = FALSE) +
      # Fix date axis labels
      scale_x_date(breaks = seq(as_date("2021-01-01"), as_date("2023-09-01"), "3 months"),
                   labels = ~ifelse(month(.) == 1, format(., "%Y"), format(., "%b"))) +
      scale_y_continuous(breaks = seq(0, 38, 4),
                         minor_breaks = NULL) +
      theme_minimal(20) +
      labs(title = "Store-wise date range",
           y = "Cumulative store count")
  }

Given that open_date is not uniformly distributed, network-wide seasonality will have to be calculated either on a sales per store basis or by standardizing the date by attributing a day ID similar to week_id. Maverik expressed the importance of aligning days in a standardized manner, which is why week_id is included. I’ll begin dissecting the fiscal calendar structure by determining if a singular day of the week begins each week.

# Determine which weekday begins each complete fiscal week.
mvts %>%
  distinct(date, .keep_all = TRUE) %>%
  # since week_id resets each year, values are not unique to distinct weeks;
  # consecutive_id() builds an id unique to each week
  group_by(unique_week_id = consecutive_id(week_id)) %>%
  # drop incomplete weeks at the edges of the date range
  filter(n() == 7) %>%
  # subset first day of each complete fiscal week
  # (summarise_all() is superseded; across() is the current idiom)
  summarise(across(everything(), first)) %>%
  # tally which weekday starts the week
  count(day_name)
## # A tibble: 1 × 2
##   day_name     n
##   <chr>    <int>
## 1 Friday     134

Now that we know each (complete) week begins on Friday, we can begin constructing a standardized day_id.

# Begin with completed date range found in data.
# Builds a lookup table mapping each calendar date to a fiscal week_id,
# fiscal year, and within-year day_id (1..~364).
day_id_df <- tibble(date = seq(as_date("2021-01-01"), as_date("2023-12-31"), "1 day")) %>%
  # Calculate week_id; week_start = 5 makes weeks begin on Friday to match
  # the fiscal calendar identified above — NOTE(review): confirm tsibble's
  # week_start numbering maps 5 to Friday
  mutate(week_id = yearweek(date, week_start = 5) %>% format("%V") %>% as.numeric(),
         # since the first day of fiscal year 2022 is actually in 2021, special logic must be 
         # applied to identify the beginning of the year:
         # x marks each 52 -> 1 week_id rollover with a 1 (NA otherwise)
         x = case_when(lag(week_id, default = 52) == 52 & week_id == 1 ~ 1),
         # running count of rollovers gives the fiscal year offset from 2020
         year = 2020 + rollapplyr(x, width = n(), FUN = sum, na.rm = TRUE, partial = TRUE)) %>%
  group_by(year) %>%
  # day_id = day number within the fiscal year
  mutate(day_id = row_number()) %>%
  select(-x) %>%
  ungroup()

day_id_df
# Validating accuracy by comparing to mvts' week_id: both tallies should
# agree week-for-week, so the final filter should return zero rows.
day_id_df %>%
  # trimming to mvts' date range
  filter(date >= "2021-01-12",
         date <= "2023-08-16") %>%
  # counting instances of manually calculated week_id and renaming columns
  count(custom_week_id = week_id,
        name = "custom_n") %>%
  # NOTE(review): bind_cols assumes both tallies have identical row counts
  # and the same week ordering — it errors or silently misaligns otherwise
  bind_cols(mvts %>%
  distinct(date, week_id) %>%
  # counting instances of native week_id and renaming columns
  count(mvts_week_id = week_id,
        name = "mvts_n")) %>%
  # subsetting discrepant rows
  filter(custom_n != mvts_n)

5 Target variables

# Network-wide share of total sales attributable to each target metric
mvts %>%
  summarise(across(inside_sales:unleaded, sum)) %>%
  pivot_longer(everything(),
               names_to = "metric",
               values_to = "sales") %>%
  # formatted percentage of the grand total
  mutate(p = percent(sales/sum(sales), 0.1))
# Per-store sales mix: stacked shares of the four metrics, with stores
# ordered left-to-right by descending diesel share.
mvts %>%
  # aggregate sales by site
  group_by(site_id) %>%
  summarise(across(inside_sales:unleaded, sum)) %>%
  # convert to long-form
  pivot_longer(inside_sales:unleaded,
               names_to = "metric",
               values_to = "sales") %>%
  # get percentage of sales by sales metric
  group_by(site_id) %>%
  mutate(p = sales/sum(sales),
         # create separate variable to be used in `arrange`
         d = p[metric == "diesel"]) %>%
  arrange(-d) %>%
  ungroup() %>%
  # ensure stores are arranged by % diesel in plot; consecutive_id() turns
  # the sorted site_ids into a plain 1..n plotting index
  mutate(site_index = consecutive_id(site_id),
         metric = fct_reorder(metric, p)) %>%
  ggplot(aes(site_index, p)) +
  geom_col(aes(fill = metric)) +
  geom_text(aes(label = percent(p, 1), group = metric),
            # place label in center of `group`
            position = position_stack(vjust = 0.5)) +
  theme_minimal(20) +
  theme(legend.position = "top",
        legend.title = element_blank()) +
  labs(title = "Proportion of total store sales by sales metric",
       x = "site index (descending diesel contribution)",
       y = NULL)

# Histogram (density): distribution of total per-store sales for each metric
mvts %>%
  pivot_longer(c(inside_sales, food_service, diesel, unleaded),
               names_to = "metric",
               values_to = "sales") %>%
  # total sales per metric per site
  group_by(metric, site_id) %>%
  summarise(sales = sum(sales)) %>%
  ggplot() +
  geom_density(aes(sales, color = metric),
               linewidth = 1) +
  theme_minimal(20) +
  theme(legend.position = "top",
        legend.title = element_blank()) +
  labs(title = "Distribution of key sales metrics")

# Violin: same per-store totals as the density plot, shown as violins with
# quartile lines
mvts %>%
  # the four target metrics sit contiguously, so a range selection works
  pivot_longer(inside_sales:unleaded,
               names_to = "metric",
               values_to = "sales") %>%
  # total sales per metric per site
  group_by(metric, site_id) %>%
  summarise(sales = sum(sales)) %>%
  ggplot() +
  geom_violin(aes(metric, sales, fill = metric),
              color = "white", draw_quantiles = c(0.25, 0.5, 0.75),
              scale = "area") +
  theme_minimal(20) +
  labs(title = "Distribution of key sales metrics")

# Line: network-wide sales summed per generalized day (day_id), colored by
# weekday/weekend, one free-scaled facet per metric
mvts %>%
  # add `day_id`
  left_join(day_id_df %>%
              select(date, day_id), "date") %>%
  # place all sales into one column, labeled by another
  pivot_longer(c(inside_sales, food_service, diesel, unleaded),
               names_to = "metric",
               values_to = "sales") %>%
  # aggregate by `day_id` and sales metric
  group_by(metric, day_id) %>%
  summarise(sales = sum(sales),
            # presumably day_type is constant within a day_id because the
            # fiscal calendar aligns weekdays — confirm before relying on it
            day_type = day_type[1]) %>%
  ggplot() +
  # `group = 1` prevents a separate line for each `day_type`
  geom_line(aes(day_id, sales, color = day_type, group = 1)) +
  # create separate plot for each `metric`
  facet_rep_wrap(~metric, repeat.tick.labels = TRUE, scales = "free", ncol = 1) +
  theme_minimal(20) +
  theme(legend.position = "top",
        legend.title = element_blank())

# Line with holiday 

# Categorical line color: annualized network sales by generalized date,
# with holidays highlighted via the line color
mvts %>%
  # add `day_id`
  left_join(day_id_df %>%
              select(date, day_id), "date") %>%
  # flag holiday rows; case_when() is vectorized row-wise, so the previous
  # group_by(day_id) here was a no-op and has been removed
  mutate(is_holiday = case_when(holiday != "NONE" ~ "holiday",
                                .default = "regular day")) %>%
  # place all sales into one column, labeled by another
  pivot_longer(c(inside_sales, food_service, diesel, unleaded),
               names_to = "metric",
               values_to = "sales") %>%
  # aggregate by `day_id` and sales metric
  group_by(metric, day_id) %>%
  summarise(sales = sum(sales),
            is_holiday = is_holiday[1]) %>% 
  ggplot() +
  # `group = 1` prevents a separate line for each `is_holiday`
  geom_line(aes(day_id, sales, color = is_holiday, group = 1),
            linewidth = 1) +
  # Create generalized date labels (month starts in a non-leap year)
  scale_x_continuous(breaks = c(1, cumsum(days_in_month(as_date(paste0("2021-", month.abb, "-01")))) + 1),
                     labels = ~format(as_date("2021-12-31") + ., "%b %d")) +
  # Rescale labels to thousands
  scale_y_continuous(labels = ~./1e3) +
  # create separate plot for each `metric`
  facet_rep_wrap(~metric, repeat.tick.labels = TRUE, scales = "free", ncol = 1) +
  theme_minimal(20) +
  theme(legend.position = "top",
        legend.title = element_blank()) +
  labs(title = "Annualized store sales",
       x = "generalized date",
       y = "sales $ (thousands)")

# Create df for plot to shade holidays: two rows per holiday x metric giving
# the rectangle corners for geom_mark_rect — x = first/last day_id of the
# holiday, y = the metric's min/max network-wide daily sales.
holiday_4gg <- mvts %>%
  # add `day_id`
  left_join(day_id_df %>%
              select(date, day_id), "date") %>%
  # place all sales into one column, labeled by another
  pivot_longer(c(inside_sales, food_service, diesel, unleaded),
               names_to = "metric",
               values_to = "sales") %>%
  # aggregate by `day_id` and sales metric
  group_by(metric, day_id) %>%
  mutate(sales = sum(sales)) %>%
  # aggregate by `metric`
  group_by(metric) %>%
  mutate(sales_max = max(sales),
         sales_min = min(sales)) %>%
  # Keep only holidays
  filter(holiday != "NONE") %>%
  group_by(holiday, metric) %>%
  # get min/max of `day_id` and `sales`
  # NOTE(review): range(day_id) spans ALL occurrences of a holiday name —
  # assumes each holiday covers a contiguous day_id run; confirm this holds
  # for holidays observed in multiple years
  reframe(day_id = range(day_id),
          sales = c(sales_min[1], sales_max[1]))

# Daily sales lines (aggregated across stores) with holiday periods
# shaded using the rectangle coordinates prepared in `holiday_4gg`.
mvts %>%
  # add `day_id`
  left_join(day_id_df %>%
              select(date, day_id), "date") %>%
  # place all sales into one column, labeled by another
  pivot_longer(c(inside_sales, food_service, diesel, unleaded),
               names_to = "metric",
               values_to = "sales") %>%
  # aggregate by `day_id` and sales metric
  group_by(metric, day_id) %>%
  summarise(sales = sum(sales),
            day_type = day_type[1]) %>%
  ggplot() +
  # shaded rectangle per holiday, spanning the holiday's day_id range and
  # the metric's full sales range; drawn first so lines render on top
  geom_mark_rect(data = holiday_4gg,
                 aes(day_id, sales, 
                     group = interaction(holiday, metric)),
                 expand = unit(1.5, "mm"), radius = unit(0.5, "mm"), 
                 fill = "#000000", alpha = 0.1, color = NA) +
  # `group = 1` prevents a separate line for each `day_type`
  geom_line(aes(day_id, sales, color = day_type, group = 1)) +
  # Rescale labels to thousands
  scale_y_continuous(labels = ~./1e3) +
  # create separate plot for each `metric`
  facet_rep_wrap(~metric, repeat.tick.labels = TRUE, scales = "free", ncol = 1) +
  theme_minimal(20) +
  theme(legend.position = "top",
        legend.title = element_blank()) +
  labs(title = "Sales vs day_id",
       subtitle = "Shaded regions indicate holidays",
       y = "sales $ (thousands)")

# Correlation: generalized pairs plot of the four daily store-level
# sales metrics.
# (A `day_id` join that was immediately dropped by `select()` has been
# removed — it had no effect on the plot.)
mvts %>%
  select(inside_sales, food_service, diesel, unleaded) %>%
  ggpairs() +
  labs(title = "Correlation of daily store sales")

# Pairs plot of annualized (per-store total) sales.
# (A `day_id` join whose column was dropped by `summarise()` has been
# removed — it had no effect on the plot.)
# NOTE(review): `site_id` remains a column of the summarised data, so it
# is plotted as a variable by ggpairs(); confirm that is intended.
mvts %>%
  group_by(site_id) %>%
  summarise(across(c(inside_sales, food_service, diesel, unleaded), sum)) %>%
  ggpairs() +
  labs(title = "Correlation of annualized store sales")

Based on the time series line plot, it’s clear that there is both weekly and yearly seasonality, but the weekly seasonality is more pronounced, with weekdays seeing greater sales than weekends. We also see that certain holidays affect sales more than others. This is not surprising, but it will certainly have to be accounted for during modeling.

All of the sales metrics besides diesel are fairly normally distributed but still right-skewed. Intuitively, inside_sales and food_service are very positively correlated. The relationships of the other pairs are rather heteroskedastic or otherwise inconsistent. Determining which (if any) features in the available data can explain this irregular variance will be valuable when forecasting.

The scatter plot from the generalized pairs plot reveals that some locations had no sales on at least one day. One may think that these might be stores not open 24 hours a day, but we discovered earlier that only one value for non-24_hour exists in the data, and consequently all stores are open 24 hours a day.

# Summary statistics for the stores that recorded zero (or negative)
# sales on at least one day for any metric, one row per store x metric.
mvts %>%
  group_by(site_id) %>%
  # keep a store if ANY metric has at least one day with sales <= 0
  # NOTE(review): the filter uses `<= 0` but `daysNoSales` below counts
  # only exact zeros — a negative value (if such exist) would pass the
  # filter yet not be counted; confirm whether negatives can occur
  filter(if_any(c(inside_sales, food_service, diesel, unleaded), ~any(.<=0))) %>%
  summarise(across(c(inside_sales, food_service, diesel, unleaded),
                   list(avg = ~mean(.),
                        med = ~median(.),
                        sd = ~sd(.),
                        max = ~max(.),
                        min = ~min(.),
                        # first decile
                        dec1 = ~quantile(., 0.1),
                        daysNoSales = ~sum(. == 0)))) %>%
  # reshape so the `metric_stat` column names split into a `metric` key
  # column and one column per statistic
  pivot_longer(-site_id,
               names_to = c("metric", ".value"),
               names_pattern = "(.*)_(.*)")

We can determine that only two stores account for the zero sales values. Based on each store’s summary statistics, the absence of sales for even a single day seems peculiar. This could be indicative of some disruption to the facilities that is not indicated in the data. It can be inferred that other stores may also have experienced similar, though partial, disruptions affecting their sales that we cannot account for. Unfortunately, not much can be done about this, and hopefully the scale of such instances is minor.

Conversely, there seems to be at least one store that has dramatically higher sales than the rest, which raises the question of what to do with sufficiently outlying data. Before deciding to remove such cases, it will be important to see if the other corresponding data can suggest an explanatory pattern which is valuable for prediction.

# Scatter of diesel vs inside sales, spotlighting the unusually large
# diesel seller (site 21980): that site is drawn fully opaque while all
# other sites are dimmed.
mvts %>%
  mutate(big_seller = site_id == 21980,
         # dim ordinary sites; the spotlighted site stays fully opaque
         alpha = if_else(big_seller, 1, 0.4)) %>%
  # sort FALSE before TRUE so highlighted points are drawn last (on top)
  arrange(big_seller) %>%
  ggplot() +
  geom_point(aes(inside_sales, diesel, color = big_seller, alpha = alpha)) +
  # take alpha values directly from the data column, no legend scale
  scale_alpha_identity() +
  labs(title = "Diesel vs Inside Sales, spotlight on Site 21980") +
  theme_minimal(20) +
  theme(legend.title = element_blank(),
        legend.position = "top")

5.1 Relationship with features

# Numerical features: annualized sales per store joined with the
# qualitative store attributes, keeping only numeric columns and
# dropping identifiers and year-like fields.
corr_df <- mvts %>%
  group_by(site_id) %>%
  # annualize each sales metric per store
  summarise(across(inside_sales:unleaded, sum)) %>%
  left_join(mvq, "site_id") %>%
  select(where(is.numeric), -c(site_id, open_year, years_since_last_project))

# Correlation matrix plot; cells with insignificant correlations
# (per `cor_pmat`) are blanked, variables ordered by clustering.
ggcorrplot::ggcorrplot(cor(corr_df),
                       method = "circle",
                       p.mat = ggcorrplot::cor_pmat(corr_df),
                       lab = TRUE,
                       lab_size = 3,
                       outline.color = "white",
                       hc.order = TRUE,
                       insig = "blank")

# Sina plots: distribution of daily store-level sales vs the count of
# women's sinks, faceted by sales metric; dashed segments mark each
# group's median.
mvts %>%
  left_join(mvq, "site_id") %>%
  # one row per store-day x metric
  pivot_longer(inside_sales:unleaded,
               names_to = "metric",
               values_to = "sales") %>%
  group_by(womens_sink_count, metric) %>%
  # per-group median, carried on every row for the linerange layer below
  mutate(med = median(sales)) %>%
  ggplot() +
  # points colored by the layer's computed `scaled` statistic
  geom_sina(aes(womens_sink_count, sales, 
                color = after_stat(scaled),
                group = womens_sink_count),
            size = 1, scale = "width") +
  # collapse the (still grouped) data to one row per group so a single
  # median segment is drawn per sink-count/metric combination
  geom_linerange(data = \(x) {x %>%
      summarise_all(~.[1])},
      aes(y = med, xmin = womens_sink_count - 0.25, xmax = womens_sink_count + 0.25),
      linetype = "dashed", linewidth = 1) +
  viridis::scale_color_viridis(guide = guide_colorbar(title.position = "top",
                                                      title.hjust = 0.5)) +
  scale_y_continuous(labels = ~./1000) +
  facet_rep_wrap(~metric, repeat.tick.labels = TRUE, scales = "free") +
  theme_minimal(20) +
  theme(legend.direction = "horizontal",
        legend.position = "top",
        legend.key.width = unit(15, "mm")) +
  # title typo fixed: was "daily store sales sales"
  labs(title = "Distribution of daily store sales\n given count of women's sinks",
       y = "sales $ (thousands)",
       color = "scaled density")

# Sina plots: distribution of daily sales vs pump counts, faceted by
# sales metric (rows) and pump type (columns); red segments mark each
# group's median. `total_pumps` sums the three pump-type counts.
mvts %>%
  left_join(mvq, "site_id") %>%
  mutate(total_pumps = traditional_forecourt_fueling_positions +
           hi_flow_lanes_fueling_positions +
           rv_lanes_fueling_positions) %>%
  # first pivot: one row per pump type (plus the derived total)
  pivot_longer(c(contains("position"), total_pumps),
               names_to = "pump_type",
               values_to = "n_pumps") %>%
  # second pivot: cross with the four sales metrics
  pivot_longer(inside_sales:unleaded,
               names_to = "metric",
               values_to = "sales") %>%
  group_by(pump_type, n_pumps, metric) %>%
  # per-group median, carried on every row for the linerange layer below
  mutate(med = median(sales)) %>%
  ggplot() +
  # points colored by the layer's computed `scaled` statistic
  geom_sina(aes(n_pumps, sales, 
                color = after_stat(scaled),
                group = n_pumps),
            size = 0.5, scale = "width") +
  # collapse the (still grouped) data to one row per group so a single
  # median segment is drawn per pump-type/count/metric combination
  geom_linerange(data = \(x) {x %>%
      summarise_all(~.[1])},
      aes(y = med, xmin = n_pumps - 0.4, xmax = n_pumps + 0.4),
      linewidth = 0.5, color = "red") +
  viridis::scale_color_viridis(guide = guide_colorbar(title.position = "top",
                                                      title.hjust = 0.5)) +
  scale_y_continuous(labels = ~./1000) +
  facet_rep_grid(metric~pump_type, repeat.tick.labels = TRUE, scales = "free") +
  theme_minimal(20) +
  theme(legend.direction = "horizontal",
        legend.position = "top",
        legend.key.width = unit(15, "mm"),
        plot.margin = unit(c(0,0,0,0), "lines")) +
  labs(title = "Distribution of daily sales given count of fuel pumps",
       y = "sales $ (thousands)",
       color = "scaled density")

5.1.1 Linear regression

# Multiple regression of each (standardized) sales metric on all other
# features, pooling daily store-level data; reports the five largest
# significant coefficients per target.
# (A useless `out <-` binding on the lambda's final expression was
# removed; the value is returned either way.)
lapply(c("inside_sales", "food_service", "diesel", "unleaded"),
       \(x){
         
         # standardized store-day design data with one-day lags of each metric
         xdata <- mvts %>%
           left_join(mvq, "site_id") %>%
           mutate(across(where(is.numeric), ~scale(.)[,1])) %>%
           group_by(site_id) %>%
           # lag each sales metric within store; the first day borrows the
           # store's last observed value as the default
           # NOTE(review): `default = last(.)` wraps the series around —
           # confirm this is preferred over an NA default
           mutate(across(inside_sales:unleaded,
                         list(lag = ~lag(., default = last(.))))) %>%
           ungroup() %>%
           # drop dates, ids, and year-like fields from the design matrix
           select(-c(where(is.Date), site_id, open_year, years_since_last_project)) 
         
         # fit target ~ all remaining columns
         xlm <- lm(
           paste(x, "~ ."),
           data = xdata 
         ) %>%
           summary
         
         # coefficient table with significance stars, model R^2, and target
         xlm$coefficients %>%
           as.data.frame(x = .) %>%
           mutate(var = row.names(.),
                  .before = 1) %>%
           as_tibble() %>%
           mutate(sig = case_when(`Pr(>|t|)` <= 0.001 ~ "***",
                                  `Pr(>|t|)` <= 0.01 ~ "**",
                                  `Pr(>|t|)` <= 0.05 ~ "*",
                                  `Pr(>|t|)` <= 0.1 ~ "."),
                  `Pr(>|t|)` = round(`Pr(>|t|)`, 5),
                  r2 = xlm$r.squared,
                  target = x) %>%
           relocate(target)
       }) %>%
  list_rbind() %>%
  # keep only significant (starred) non-intercept terms
  filter(!grepl("Intercept", var),
         grepl("\\*", sig)) %>%
  group_by(target) %>%
  # NOTE(review): ranks by signed Estimate, so only large positive
  # coefficients surface; use abs(Estimate) if magnitude is what's meant
  slice_max(Estimate, n = 5)
# Simple LM (one predictor at a time) on annualized sales: regress each
# standardized target on every other feature individually and report the
# five predictors with the highest R^2 per target.
# Fix: the inner loop previously iterated over ALL columns, including
# the target itself — `lm(x ~ x)` fits trivially (R^2 = 1) and would top
# the `slice_max(r2)` ranking; the target is now excluded.
lapply(c("inside_sales", "food_service", "diesel", "unleaded"),
       \(x){
         
         # one row per store: annualized sales joined with qualitative
         # attributes, all numeric columns standardized
         xdata <- mvts %>%
           group_by(site_id) %>%
           summarise(across(inside_sales:unleaded, sum)) %>%
           left_join(mvq, "site_id") %>%
           mutate(across(where(is.numeric), ~scale(.)[,1])) %>%
           # drop dates, ids, and year-like fields
           select(-c(where(is.Date), site_id, open_year, years_since_last_project))
         
         # fit one single-predictor model per column, excluding the
         # target itself (see note above)
         lapply(setdiff(colnames(xdata), x),
                \(y){
                  xlm <- lm(
                    paste(x, "~", y),
                    data = xdata 
                  ) %>%
                    summary
                  
                  # coefficient table with stars, R^2, and target label
                  xlm$coefficients %>%
                    as.data.frame(x = .) %>%
                    mutate(var = row.names(.),
                           .before = 1) %>%
                    as_tibble() %>%
                    mutate(sig = case_when(`Pr(>|t|)` <= 0.001 ~ "***",
                                           `Pr(>|t|)` <= 0.01 ~ "**",
                                           `Pr(>|t|)` <= 0.05 ~ "*",
                                           `Pr(>|t|)` <= 0.1 ~ "."),
                           `Pr(>|t|)` = round(`Pr(>|t|)`, 5),
                           r2 = xlm$r.squared,
                           target = x) %>%
                    relocate(target)
                }) %>% list_rbind()
         
       }) %>%
  list_rbind() %>%
  # keep only significant (starred) non-intercept terms
  filter(!grepl("Intercept", var),
         grepl("\\*", sig)) %>%
  group_by(target) %>%
  # top five predictors by explained variance for each target
  slice_max(r2, n = 5)

5.2 Time series decomposition